from IPython.display import HTML
# Build an HTML snippet for the notebook: the embedded script defines
# `code_toggle`, which hides or shows every `div.input` element and flips the
# `code_show` flag, and registers it to run once when the document is ready.
# The trailing text contains a link that calls `code_toggle()` again on click.
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')
Overall idea and required video content: how to prevent future accidents in France?
Ideas for the descriptive analysis
# Start writing code here...
# Define imports
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns
import pandas as pd
import folium
import plotly.express as px
# fix random generator seed (for reproducibility of results)
np.random.seed(42)
# IPython magic (only valid inside a notebook/IPython session): use
# matplotlib's inline backend.
%matplotlib inline
# Use seaborn's 'darkgrid' style for the figures created below.
sns.set_style('darkgrid')
Load data for 2018
# Read the four 2018 tables (characteristics, places, users, vehicles),
# loading only the columns listed next to each file.
char_2018, places_2018, users_2018, vehicles_2018 = (
    pd.read_csv(csv_path, usecols=wanted_columns)
    for csv_path, wanted_columns in (
        ('work/Dataset/caracteristiques-2018.csv',
         ['Num_Acc','an','mois','jour','hrmn','lum','atm','com','lat','long','col']),
        ('work/Dataset/lieux-2018.csv',
         ['Num_Acc','catr','nbv','prof','plan','surf','infra','situ']),
        ('work/Dataset/usagers-2018.csv',
         ['Num_Acc','catu','grav','sexe','trajet','an_nais','secu']),
        ('work/Dataset/vehicules-2018.csv',
         ['Num_Acc','catv']),
    )
)
Load data from 2017
# Read the four 2017 tables (characteristics, places, users, vehicles),
# loading only the columns listed next to each file.
char_2017, places_2017, users_2017, vehicles_2017 = (
    pd.read_csv(csv_path, usecols=wanted_columns)
    for csv_path, wanted_columns in (
        ('work/Dataset/caracteristiques-2017.csv',
         ['Num_Acc','an','mois','jour','hrmn','lum','atm','com','lat','long','col']),
        ('work/Dataset/lieux-2017.csv',
         ['Num_Acc','catr','nbv','prof','plan','surf','infra','situ']),
        ('work/Dataset/usagers-2017.csv',
         ['Num_Acc','catu','grav','sexe','trajet','an_nais','secu']),
        ('work/Dataset/vehicules-2017.csv',
         ['Num_Acc','catv']),
    )
)
Load data from 2016
# Read the four 2016 tables (characteristics, places, users, vehicles),
# loading only the columns listed next to each file. Note that the 2016 file
# names use an underscore before the year.
char_2016, places_2016, users_2016, vehicles_2016 = (
    pd.read_csv(csv_path, usecols=wanted_columns)
    for csv_path, wanted_columns in (
        ('work/Dataset/caracteristiques_2016.csv',
         ['Num_Acc','an','mois','jour','hrmn','lum','atm','com','lat','long','col']),
        ('work/Dataset/lieux_2016.csv',
         ['Num_Acc','catr','nbv','prof','plan','surf','infra','situ']),
        ('work/Dataset/usagers_2016.csv',
         ['Num_Acc','catu','grav','sexe','trajet','an_nais','secu']),
        ('work/Dataset/vehicules_2016.csv',
         ['Num_Acc','catv']),
    )
)
def _merge_accident_tables(users, characteristics, places, vehicles):
    """Join one year's four tables on ``Num_Acc`` and drop incomplete rows.

    Parameters
    ----------
    users, characteristics, places, vehicles : pandas.DataFrame
        The per-year tables; each must contain a ``Num_Acc`` column.

    Returns
    -------
    pandas.DataFrame
        The inner-joined table with every row containing a missing value
        removed. The resulting row count is printed as a side effect.
    """
    merged = users
    for table in (characteristics, places, vehicles):
        merged = pd.merge(merged, table, on='Num_Acc')
    merged = merged.dropna()
    print("The merged dataset now contains: ", len(merged), "observations.")
    return merged


# NOTE(review): users and vehicles are joined on Num_Acc only (no vehicle id
# column is loaded), so a user row is repeated once per vehicle recorded for
# the same Num_Acc. Row counts below are therefore counts of merged rows, not
# necessarily of distinct accidents or users -- confirm this is intended.
data_2018 = _merge_accident_tables(users_2018, char_2018, places_2018, vehicles_2018)
data_2017 = _merge_accident_tables(users_2017, char_2017, places_2017, vehicles_2017)
data_2016 = _merge_accident_tables(users_2016, char_2016, places_2016, vehicles_2016)

# Concatenate the yearly data sets (2018, 2017, 2016, in that order).
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each year contributes its own labels and the result has duplicate index
# labels, which makes any later label-based alignment ambiguous.
data = pd.concat([data_2018, data_2017, data_2016], axis=0, ignore_index=True)
# Two-digit years (e.g. 18) become four-digit years (e.g. 2018).
data['an'] = data['an'] + 2000

# Scaled coordinates: the raw lat/long columns divided by 10**5.
data['latS'] = data['lat'] / 10**5
data['longS'] = data['long'] / 10**5

# Keep only rows inside the box 40 <= latS <= 51.25 and -5 <= longS <= 9.8.
inside_box = (
    (data['latS'] >= 40)
    & (data['latS'] <= 51.25)
    & (data['longS'] >= -5)
    & (data['longS'] <= 9.8)
)
data = data[inside_box]

# Rename the original column names to English names.
names = {
    'an': 'year', 'grav': 'severity', 'sexe': 'gender', 'an_nais': 'birth',
    'jour': 'day', 'hrmn': 'HHMM', 'catr': 'roadtype', 'mois': 'month',
    'catu': 'User category', 'trajet': 'trip purpose', 'secu': 'safety',
    'nbv': 'traffic lanes', 'surf': 'surface condition',
    'infra': 'infrastructure', 'situ': 'situation',
    'catv': 'vehicle category', 'col': 'collision_type',
}
data = data.rename(columns=names)

# Left-pad the time of day with zeros to four characters (e.g. 12 -> '0012'),
# then split it into its first two (hour) and remaining (minute) characters.
data['HHMM'] = data['HHMM'].apply(lambda raw: str(raw).zfill(4))
data['minute'] = data['HHMM'].str[2:]
data['hour'] = data['HHMM'].str[:2]
data.info()
Column including the age of the victims
# Age is the year column minus the birth-year column.
data['age'] = data['year'] - data['birth']
Replace numbers with names of categories
# Lookup tables from numeric codes to readable labels.
num2atm = {1: 'Normal ', 2: 'Light Rain', 3: 'Heavy Rain', 4: 'Snow - hail', 5: 'Fog - smoke',
           6: 'Strong wind - storm', 7: 'Dazzling weather', 8: 'Cloudy weather', 9: 'Other'}
num2severity = {1: 'Unscathed ', 2: 'Killed', 3: 'Hospitalized wounded', 4: 'Light injury'}
num2lum = {1: 'Full day', 2: 'Twilight or dawn', 3: 'Night without public lighting',
           4:'Night with public lighting not lit', 5:'Night with public lighting on'}
num2sex = {1: 'Male', 2: 'Female'}
num2userCategory = {1: 'Driver', 2: 'Passenger', 3:'Pedestrian', 4:'Skateboarder or scooter'}
num2tripPurpose = {0: 'Not denoted', 1: 'Home - work', 2: 'Home - school', 3:'Shopping',
                   4:'Professional use', 5:'Walk - leisure', 9:'Other'}
num2safety = {1: 'seatbelt', 2:'helmet', 11:'seatbelt yes', 12:'seatbelt no',
              13:'seatbelt not denoted', 21:'helmet yes', 22:'helmet no',
              23:'helmet not denoted', 3:'Children device', 31:'Children device yes',
              32:'Children device no', 33:'Children device not denoted',
              41:'Reflective equipment yes', 42:'Reflective equipment no',
              43:'Reflective equipment not denoted', 91:'Other yes', 92:'Other no',
              93:'Other not denoted'}

# (source column, destination column, lookup table). The first three write a
# new *_name column; the rest overwrite the coded column in place.
for _source, _target, _lookup in (
    ('atm', 'atm_name', num2atm),
    ('severity', 'severity_name', num2severity),
    ('lum', 'lum_name', num2lum),
    ('gender', 'gender', num2sex),
    ('User category', 'User category', num2userCategory),
    ('trip purpose', 'trip purpose', num2tripPurpose),
    ('safety', 'safety', num2safety),
):
    # Plain indexing on purpose: a code missing from the table raises KeyError.
    data[_target] = data[_source].apply(lambda code, _table=_lookup: _table[code])
# Lookup tables from numeric road-description codes to readable labels.
# Each column is overwritten in place; a code missing from its table raises
# KeyError.

# Longitudinal profile of the road. Per the BAAC data dictionary, code 1 is
# flat terrain ("Plat") and code 4 is the bottom of a hill ("Bas de cote");
# the previous labels 'Straight' and 'Coastline' were mistranslations.
num2prof = {0: 'Not denoted', 1: 'Flat', 2: 'Slope', 3:'Hilltop', 4:'Bottom of hill'}
data['prof'] = data['prof'].apply(lambda x: num2prof[x])

# Layout of the road in plan view.
num2plan = {0: 'Not denoted', 1: 'Rectilinear part', 2: 'Curving to the left', 3:'Curving to the right', 4:'S curve'}
data['plan'] = data['plan'].apply(lambda x: num2plan[x])

# Condition of the road surface.
num2surfaceCondition = {0: 'Not denoted', 1: 'Normal', 2: 'Wet', 3:'puddles', 4:'flooded',5:'snowy',6:'mud',7:'icy',8:'fats - oil',9:'Other'}
data['surface condition'] = data['surface condition'].apply(lambda x: num2surfaceCondition[x])

# Infrastructure at the accident location.
num2infrastructure = {0: 'Not denoted', 1: 'Underground - tunnel', 2: 'Bridge - flyover', 3:'Exchanger or connection sling', 4:'Track',5:'Arranged crossroads',6:'Pedestrian zone',7:'Toll zone'}
data['infrastructure'] = data['infrastructure'].apply(lambda x: num2infrastructure[x])

# Category of the road.
num2roadtype = {0: 'Not denoted', 1: 'Highway', 2: 'National road', 3:'Departmental road', 4:'Communal roads',5:'Outside the public network',6:'Parking lot open to public traffic',7:'Urban metropolis roads', 9:'Other'}
data['roadtype'] = data['roadtype'].apply(lambda x: num2roadtype[x])
# Collision-type labels for codes 1..7, listed in code order.
num2col = dict(enumerate([
    'Two vehicles - frontal',
    'Two vehicles - from the rear',
    'Two vehicles - from the side',
    'Three vehicles and more - in a chain',
    'Three or more vehicles - multiple collisions',
    'Other collision',
    'No collision',
], start=1))
# Overwrite the coded column with its label; an unmapped code raises KeyError.
data['collision_type'] = data['collision_type'].apply(lambda code: num2col[code])
# Number of rows per collision type.
data['collision_type'].value_counts()
Create coarse vehicle categories from the existing vehicle categories
def crude_vc(cat):
    """Map a detailed vehicle-category code to a coarse vehicle class.

    The groups are checked in order and the label of the first group that
    contains ``cat`` is returned. A code that belongs to no group yields -1.
    """
    groups = (
        ((1,), 'bicycle'),
        ((2, 3, 4, 5, 6, 30, 31, 32, 33, 34, 35, 36, 41, 42, 43), 'light vehicles'),
        ((7, 8, 9, 10, 11, 12, 13, 14, 15), 'car'),
        ((16, 17, 19, 21, 37, 38, 40), 'heavy vehicle'),
        ((39,), 'train'),
        ((18, 20, 50, 60, 80, 99), 'other'),
    )
    for codes, label in groups:
        if cat in codes:
            return label
    return -1
# Coarse vehicle class for every row.
data['crude_vc'] = data['vehicle category'].apply(crude_vc)

# Build a 'year/month/day' string per row and parse it into a datetime column.
year_text = data['year'].astype(str)
month_text = data['month'].astype(str)
day_text = data['day'].astype(str)
data['date'] = pd.to_datetime(year_text + '/' + month_text + '/' + day_text, format='%Y/%m/%d')
# Number of rows per date, computed once and reused by both figures below
# (it was previously recomputed for every use).
daily_counts = data.groupby('date')['Num_Acc'].count()

# Matplotlib line plot with the mean daily count as a dashed red line.
daily_counts.plot(figsize=(16,8))
plt.title("Number of accidents over time",fontsize=22)
plt.axhline(y = daily_counts.mean(), color = 'r', linestyle = '--')

# Interactive plotly version of the same series.
fig = px.line(daily_counts, x=daily_counts.index, y='Num_Acc',labels={"x": "Date", "Num_Acc": "Number of Accidents"})
fig.update_layout(
    title="Number of accident on a given date",
    xaxis_title="Date",
    yaxis_title="Number of accidents on a given date",
)
fig.update_traces(line_color='red')
fig.show()
# Number of rows per (year, month) as a bar chart. This figure was previously
# produced twice by two identical statements; it is now drawn once.
monthly_by_year = data.groupby(['year','month'])['Num_Acc'].count()
monthly_by_year.plot(kind='bar',figsize=(16,8))
plt.title("Number of accidents per month",fontsize=22)

# Number of rows per calendar month (all years pooled), computed once and
# shown as an interactive plotly bar chart with month names on the x axis.
monthly_counts = data.groupby(['month'])['Num_Acc'].count()
fig = px.bar(monthly_counts, x=monthly_counts.index, y='Num_Acc',labels={"x": "Month", "Num_Acc": "Number of Accidents"})
fig.update_layout(
    title="Number of reported accidents per month",
    xaxis_title="Month",
    yaxis_title="Number of accidents per month",
    xaxis = dict(
        tickmode = 'array',
        tickvals = [1,2,3,4,5,6,7,8,9,10,11,12],
        ticktext = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    )
)
fig.update_traces(marker_color='red')
fig.show()
# Row counts for every (lighting condition, severity) pair.
severity_by_lum = data.groupby(["lum_name", "severity_name"]).size()

# Absolute counts: one stacked bar per lighting condition.
df_test = severity_by_lum.rename('count').reset_index()
pivot_df = df_test.pivot(index='lum_name', columns='severity_name', values='count')
axes = pivot_df.plot.bar(stacked=True, figsize=(10,7), rot=45)
axes.legend(loc='center right', bbox_to_anchor=(1.3, 0.45), fancybox=True, shadow=True)
plt.xticks(horizontalalignment="right")
plt.ylabel('Number of accidents')
plt.xlabel('Lum category')

# Relative frequencies: divide each row by its total so every lighting
# condition sums to 1.
sev_norm = severity_by_lum.unstack()
row_totals = sev_norm.sum(axis=1)
sev_norm = sev_norm.div(row_totals, axis=0)
sev_norm
axes = sev_norm.plot(kind='bar', stacked=True, rot=45, figsize=(16,8))
axes.legend(loc='center right', bbox_to_anchor=(1.3, 0.45), fancybox=True, shadow=True)
plt.xticks(horizontalalignment="right")
plt.title("Normalized number of accidents per lum category", fontsize=20)
plt.ylabel("Relative frequency")
plt.xlabel("Lum category")
After normalizing, it can be seen that "Night without public lighting" is the lighting category with the highest share of people killed.
# Split the HHMM string into its minute (characters after the first two) and
# hour (first two characters) parts.
data['minute'] = data['HHMM'].str[2:]
data['hour'] = data['HHMM'].str[:2]

# Two bar charts side by side: rows per hour (left) and rows per minute (right).
hour_minute_fig, (hour_ax, minute_ax) = plt.subplots(1, 2, figsize=(20,8))

hour_ax.set_title('Number of reported accidents per hour')
data['hour'].value_counts().sort_index().plot(kind='bar', rot=0, ax=hour_ax)
hour_ax.set_xlabel('Hour of the day')
hour_ax.set_ylabel('Number of reported accidents')

minute_ax.set_title('Number of reported accidents per minute')
data['minute'].value_counts().sort_index().plot(kind='bar', rot=0, xticks=np.arange(0, 60, 5), ax=minute_ax)
minute_ax.set_xlabel('Minute of the day')
minute_ax.set_ylabel('Number of reported accidents')
plt.show();
Graphs used for presentation
def _show_count_bar(group_col, x_label, title, yaxis_title, xaxis_title=None, xaxis=None):
    """Show a red plotly bar chart of the number of rows per value of a column.

    Parameters
    ----------
    group_col : str
        Column of the global ``data`` frame to group by.
    x_label : str
        Label used for the x values (plotly ``labels`` mapping).
    title, yaxis_title : str
        Figure title and y-axis title.
    xaxis_title : str, optional
        X-axis title; defaults to ``x_label``.
    xaxis : dict, optional
        Extra x-axis layout settings (e.g. tick configuration).

    Returns
    -------
    The plotly figure, after it has been shown.
    """
    # Group once and reuse the result for both the values and the x positions.
    counts = data.groupby([group_col])['Num_Acc'].count()
    figure = px.bar(counts, x=counts.index, y='Num_Acc',
                    labels={"x": x_label, "Num_Acc": "Number of Accidents"})
    layout = dict(
        title=title,
        xaxis_title=x_label if xaxis_title is None else xaxis_title,
        yaxis_title=yaxis_title,
    )
    if xaxis is not None:
        layout['xaxis'] = xaxis
    figure.update_layout(**layout)
    figure.update_traces(marker_color='red')
    figure.show()
    return figure


# Rows per hour of the day, one tick per hour.
fig = _show_count_bar(
    'hour', "Hour of day",
    title="Number of reported accidents per hour of the day",
    xaxis_title="Hour of the day",
    yaxis_title="Number of accidents per hour of the day",
    xaxis=dict(tickmode='array', tickvals=list(range(24))),
)

# Rows per minute value, one tick every five minutes.
fig = _show_count_bar(
    'minute', "Minute of the day",
    title="Number of reported accidents per minute of the day",
    yaxis_title="Number of accidents per minute of the day",
    xaxis=dict(tickmode='linear', tick0=0, dtick=5),
)

# Rows per age, one tick every five years.
fig = _show_count_bar(
    'age', "Age of victim",
    title="Number of victims with a given age",
    yaxis_title="Number of victims with a given age",
    xaxis=dict(tickmode='linear', tick0=0, dtick=5),
)

# Rows per atmospheric condition.
fig = _show_count_bar(
    'atm_name', "Atmospheric condition",
    title="Number of accident under a given atmospheric conditions",
    yaxis_title="Number of accidents under a given atmospheric condition",
)
# Ten-year age groups: bin edges -1, 9, 19, ..., 89, inf with labels
# '0-9', '10-19', ..., '80-89', '90+'.
bins = [-1] + list(range(9, 90, 10)) + [np.inf]
labels = ['{}-{}'.format(low, low + 9) for low in range(0, 90, 10)] + ['90+']
data['age_group'] = pd.cut(data['age'], bins=bins, labels=labels)

# Combined "<gender> <age group>" label and the number of rows per label.
data['gender_agegroup'] = data['gender'] + ' ' + data['age_group'].astype(str)
group_sizes = data.groupby('gender_agegroup').size()
group_sizes.plot(kind='bar', rot=45, figsize=(12,8))
from IPython.core.display import display, HTML
import tempfile
# Reference for this code:
def folium_deepnote_show(m):
    """Display a folium map inline with a fixed pixel height.

    The map is saved to a temporary HTML file, the HTML is read back, its
    100%-height style rules are rewritten (the container rule to width only,
    the map rule to 609px), and the result is displayed.

    Parameters
    ----------
    m : object with a ``save(path)`` method (here a ``folium.Map``).

    The temporary directory and file are removed and the file handle is
    closed before the function returns; previously the handle was left open
    and the saved file was never deleted.
    """
    import os  # local import: only this function needs it

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_output_filename = os.path.join(tmp_dir, 'map.html')
        m.save(tmp_output_filename)
        # Read explicitly as UTF-8 instead of the platform default encoding.
        # NOTE(review): assumes the map's save() writes UTF-8 -- confirm.
        with open(tmp_output_filename, "r", encoding="utf-8") as f:
            html_source = f.read()
    data_fixed_width = html_source.replace('width: 100%;height: 100%', 'width: 100%').replace('height: 100.0%;', 'height: 609px;')
    display(HTML(data_fixed_width))
# Random sample of 10000 rows with the columns needed for the map popups.
_map_columns = ['longS','latS','severity','date','severity_name','gender','age','lum_name','atm_name','roadtype','traffic lanes',
                'surface condition','crude_vc','collision_type']
lat_lon = data[_map_columns].sample(10000)
locations = lat_lon[['latS','longS']]
locationlist = locations.values.tolist()
print('''this is {}
that is {}'''.format('help',10))
from IPython.display import display
from folium.plugins import FastMarkerCluster

# Candidate tile styles for folium (per the original note, the last ones do
# not work properly); the map below uses the fourth entry.
t_list = ["Stamen Terrain", "Stamen Toner", "Mapbox Bright", 'Cartodb Positron']

# Map with its starting coordinates, tile style and zoom level.
m = folium.Map(location=[47.092038, 2.392312], tiles=t_list[3], zoom_start=6)

# Marker colour per severity code 1..4:
# unscathed, killed, hospitalized, light injury.
colours = ['lightgreen','black','darkred','orange']

# Popup text; the placeholders are filled in the order of _popup_columns.
_popup_template = '''
Date: {}<br>
Severity: {}<br>
Gender: {}<br>
Age: {}<br>
Lumination: {}<br>
Atmosphere: {}<br>
Roadtype: {}<br>
Traffic lanes: {}<br>
Surface conditions: {}<br>
Vehicle type: {}<br>
Collision type: {}
'''
_popup_columns = ['date', 'severity_name', 'gender', 'age', 'lum_name', 'atm_name', 'roadtype',
                  'traffic lanes', 'surface condition', 'crude_vc', 'collision_type']

# One circle marker with a popup for each of the first 1000 sampled rows.
for point, location in enumerate(locationlist[:1000]):
    html = _popup_template.format(*[lat_lon[column].iloc[point] for column in _popup_columns])
    iframe = folium.IFrame(html, width=300, height=300)
    popup = folium.Popup(iframe, max_width=500)
    marker_colour = colours[lat_lon['severity'].iloc[point] - 1]
    folium.CircleMarker(location, radius=2, popup=popup, color=marker_colour).add_to(m)

# (Marker clustering via FastMarkerCluster is currently disabled.)
# Display map
folium_deepnote_show(m)
# All rows (no sampling) with the columns needed for the map, then only the
# rows dated 2016-12-20.
_map_columns = ['longS','latS','severity','date','severity_name','gender','age','lum_name','atm_name','roadtype','traffic lanes',
                'surface condition','crude_vc','collision_type']
lat_lon = data[_map_columns]
lat_lon_special = lat_lon[lat_lon['date'] == '2016-12-20']
locations = lat_lon_special[['latS','longS']]
locationlist = locations.values.tolist()
from IPython.display import display
from folium.plugins import FastMarkerCluster

# Candidate tile styles for folium (per the original note, the last ones do
# not work properly); the map below uses the fourth entry.
t_list = ["Stamen Terrain", "Stamen Toner", "Mapbox Bright", 'Cartodb Positron']

# Map with its starting coordinates, tile style and zoom level.
m = folium.Map(location=[47.092038, 2.392312], tiles=t_list[3], zoom_start=6)

# Marker colour per severity code 1..4:
# unscathed, killed, hospitalized, light injury.
colours = ['lightgreen','black','darkred','orange']

# NOTE(review): the per-row marker loop and the FastMarkerCluster layer that
# used to follow here were both commented out, so nothing is added to `m`
# before it is displayed -- confirm that an empty base map is intended.
# Display map
folium_deepnote_show(m)

# Number of rows per (longS, latS) pair on that date, smallest first.
lat_lon_special.groupby(['longS','latS']).size().sort_values()
# Scatter plot of the raw (unscaled) long/lat columns.
sns.scatterplot(x='long', y='lat', data=data)
import matplotlib.cm as cm
# Borrowed from the GitHub authors
def set_color(data, col):
    """Return one colour per row, chosen by the row's value in ``col``.

    The sorted unique values of ``data[col]`` are paired with colours taken at
    evenly spaced positions of matplotlib's rainbow colormap, and every row is
    replaced by the colour paired with its value.
    """
    categories = np.unique(data[col])
    palette = cm.rainbow(np.linspace(0, 1, len(categories)))
    colour_of = {category: colour for category, colour in zip(categories, palette)}
    return data[col].apply(lambda value: colour_of[value])
# Location of every row coloured by its severity code.
# The severity codes are passed as `c` together with the rainbow colormap and
# the vmin/vmax limits. Previously explicit RGBA colours were passed, so the
# vmin/vmax arguments had no effect and the colorbar had no data mapping to
# show. When every code between min and max occurs, the colours are the same
# evenly spaced rainbow colours as before.
plt.figure(figsize=(10,8))
sc = plt.scatter(
    data['long'], data['lat'],
    c=data['severity'], cmap=cm.rainbow, s=0.1,
    vmin=data.severity.min(), vmax=data.severity.max(),
)
plt.title('Location of the accident, depending on gravity')
plt.colorbar(sc)
plt.show()
# Per-row RGBA colours for the severity column (notebook display).
set_color(data, 'severity')
#pip install shap
# Model inputs and the prediction target.
features = ['User category','gender','trip purpose','safety','year','month','day','hour','minute','lum_name','atm_name','lat','long','roadtype','prof','plan','surface condition','infrastructure','crude_vc','collision_type']
target = ['severity_name']

# Target column followed by the feature columns.
help_df = data[target + features]
help_df

# Reduce data size to not run out of memory: every severity class is
# down-sampled to the number of 'Killed' rows.
sample_size = int((help_df['severity_name'] == 'Killed').sum())
print("Sample size:", sample_size)


def _draw_class(label):
    """Return ``sample_size`` randomly drawn rows whose severity is ``label``."""
    return help_df[help_df['severity_name'] == label].sample(sample_size)


Unscathed = _draw_class('Unscathed ')
Light_injury = _draw_class('Light injury')
Hospitalized_wounded = _draw_class('Hospitalized wounded')
Killed = _draw_class('Killed')
data_balanced = pd.concat([Unscathed, Light_injury, Hospitalized_wounded, Killed], axis=0)
data_balanced.head()
One-hot encode categorical features
from scipy.fftpack import fft, dct
df_features = data_balanced[features]

# (categorical column, prefix for its indicator columns). A prefix of None
# keeps the bare category value as the indicator column name.
_encodings = [
    ('User category', None),
    ('trip purpose', 'Purpose'),
    ('safety', 'Safety'),
    ('prof', 'prof'),
    ('plan', 'plan'),
    ('surface condition', 'surface'),
    ('infrastructure', 'infra'),
    ('gender', None),
    ('lum_name', 'lum'),
    ('atm_name', 'atm'),
    ('roadtype', 'roadtype'),
    ('crude_vc', 'vehtype'),
    ('collision_type', 'col'),
]

# Append the indicator columns of each categorical column, in the order above.
for _column, _prefix in _encodings:
    _indicators = pd.get_dummies(df_features[_column], prefix=_prefix)
    df_features = pd.concat([df_features, _indicators], axis=1)

# The original categorical columns are no longer needed.
df_features = df_features.drop(columns=[_column for _column, _ in _encodings])
df_features
Split in training and test set
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
# Feature matrix (same object as df_features) with hour/minute cast from
# strings to integers, and the target frame.
X = df_features
X['hour'] = X['hour'].astype(int)
X['minute'] = X['minute'].astype(int)
y = data_balanced[target]
#x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Manual random split: shuffle the row positions and cut at train_perc.
train_perc = 0.75  # fraction of rows used for training
perm = np.random.permutation(len(y))
split_point = int(train_perc * len(y))
ix_train, ix_test = perm[:split_point], perm[split_point:]  # positions of training / test rows
x_train, x_test = X.iloc[ix_train, :], X.iloc[ix_test, :]
y_train, y_test = y.iloc[ix_train], y.iloc[ix_test]
print("num train: %d" % len(y_train))
print("num test: %d" % len(y_test))
# Define logistic regression and predict severity class
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# Fit a multinomial logistic regression and predict the severity class.
# The target is passed as a 1-D Series rather than a one-column DataFrame.
LogReg = LogisticRegression(multi_class='multinomial')
LogReg.fit(x_train, y_train.severity_name)
y_pred = LogReg.predict(x_test)
acc_LogReg = round(accuracy_score(y_test.severity_name, y_pred) * 100, 2)
print("Accuracy logistic regression:", acc_LogReg)

# Class names in sorted order, passed as both `labels` and `target_names` so
# every report row and matrix row/column carries the name of the class it
# actually describes. Previously the names were given in order of first
# appearance while the report used sorted labels, so the rows were mislabelled.
target_names = sorted(y.severity_name.unique())
print(classification_report(y_test.severity_name, y_pred, labels=target_names, target_names=target_names))
mat = confusion_matrix(y_test.severity_name, y_pred, labels=target_names)
sns.heatmap(mat, square=True, annot=True, cbar=True, fmt='g', cmap='Blues',
            xticklabels=target_names, yticklabels=target_names)
plt.xlabel('predicted value')
plt.ylabel('true value');
import shap
!pip install xgboost
severity2num = {'Unscathed ': 1,'Killed': 2,'Hospitalized wounded': 3,'Light injury': 4}
# XGBoost's classifier expects class labels 0..n_classes-1 (current releases
# reject 1..4), so the 1-based severity codes are shifted down by one for
# training and shifted back up when predictions are decoded below.
y_train_num = y_train.severity_name.apply(lambda x: severity2num[x]) - 1
# Import XGBoost for classification
from xgboost import XGBClassifier
# Parameters for XG Boost - Note some of these are for regression.
# Currently unused because the set_params call below is commented out.
# 'interaction_constraints' used to appear twice in this literal (None, then
# False); only the later value took effect, so it is written once here.
xgb_params = {'base_score': 0.5, 'booster': 'gblinear', 'colsample_bylevel': 0.5,
              'colsample_bynode': 0.5, 'gamma': 0, 'gpu_id': 0, 'interaction_constraints': False,
              'max_delta_step': 10, 'min_child_weight': 1, 'n_jobs': -1, 'num_parallel_tree': 1,
              'random_state': 42, 'reg_alpha': 0, 'reg_lambda': 10, 'scale_pos_weight': 1,
              'objective': 'multi:softmax', 'num_class': 4, 'colsample_bytree': 0.3, 'learning_rate': 0.1,
              'max_depth': 3,'n_estimators': 200, 'lambda': 10, 'subsample': 0.5,
              'tree_method': 'auto', 'validate_parameters': False, 'verbosity': 1,
              'monotone_constraints': False}
# Define XG Boost model (default parameters)
xgb = XGBClassifier()
# Set parameters
#xgb.set_params(**xgb_params)
xgb.fit(x_train,y_train_num)
# Calcuate the explaining model
#explainer = shap.Explainer(xgb)
# Get the shap values (values that explain the impact)
#shap_values = explainer(x_train)
# summarize the effects of all the features
#shap.plots.beeswarm(shap_values)
#shap.plots.beeswarm(shap_values.sum(axis=2))
#shap_values.values.sum(axis=2).shape
#shap.plots.beeswarm(shap_values.sum(axis=2))
#shap.summary_plot(shap_values[:,:,1], x_train)
#shap.summary_plot(shap_values.sum(axis=2), x_train, plot_type="bar")

# Predict 0-based class indices and decode them back to severity names.
y_pred = pd.Series(xgb.predict(x_test), name='severity_num').apply(lambda x: num2severity[x + 1])
acc_xgb = round(accuracy_score(y_test.severity_name, y_pred) * 100, 2)
print("Accuracy XG Boost classifier:", acc_xgb)

# Class names in sorted order, passed as both `labels` and `target_names` so
# every report row and matrix row/column carries the name of the class it
# actually describes. Previously the names were given in order of first
# appearance while the report used sorted labels, so the rows were mislabelled.
target_names = sorted(y.severity_name.unique())
print(classification_report(y_test.severity_name, y_pred, labels=target_names, target_names=target_names))
mat = confusion_matrix(y_test.severity_name, y_pred, labels=target_names)
sns.heatmap(mat, square=True, annot=True, cbar=True, fmt='g', cmap='Blues',
            xticklabels=target_names, yticklabels=target_names)
plt.xlabel('predicted value')
plt.ylabel('true value');
# List the names of all model input columns.
for col in x_train.columns:
    print(col)

# Importance reported by the fitted model for each input column.
feat_imp = dict(zip(x_train.columns, xgb.feature_importances_))
feature_importance = pd.DataFrame(feat_imp, index=['feature_importance']).T

# Bar chart of the ten largest importances.
top_ten = feature_importance['feature_importance'].sort_values(ascending=False).head(10)
top_ten.plot(kind='bar', figsize=(8,6))
plt.xlabel('Feature')
plt.ylabel('Percentage importance')
plt.title('Feature importance - 10 most important features');

# XGBoost's own importance plot.
from xgboost import plot_importance
plot_importance(xgb)
plt.show()

# Raw importances for all input columns, in column order.
importances = xgb.feature_importances_
plt.bar(range(len(importances)), importances)
plt.show()
# Number of rows per calendar month (all years pooled). The grouping is
# computed once and reused for both the bar heights and the x positions
# (it was previously computed twice).
monthly_counts = data.groupby(['month'])['Num_Acc'].count()
fig = px.bar(monthly_counts, x=monthly_counts.index, y='Num_Acc',labels={"x": "Month", "Num_Acc": "Number of Accidents"})
fig.update_layout(
    title="Number of reported accidents per month",
    xaxis_title="Month",
    yaxis_title="Number of accidents per month",
    xaxis = dict(
        tickmode = 'array',
        tickvals = [1,2,3,4,5,6,7,8,9,10,11,12],
        ticktext = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    )
)
fig.show()